Try executing this chunk by clicking the Run button within the chunk or by placing your cursor inside it and pressing Cmd+Shift+Enter.
Add a new chunk by clicking the Insert Chunk button on the toolbar or by pressing Cmd+Option+I.
#some ideas taken from here https://www.bioinformatics.babraham.ac.uk/training/10XRNASeq/seurat_workflow.html
# install.packages(c("tidyverse", "biomaRt","ggthemes","data.table","patchwork","Seurat", "reshape2"))
#load libraries
library(dplyr)
library(Seurat)
library(patchwork)
library(data.table)
library(stringr)
library(ggplot2)
library("biomaRt")
library(ggthemes)
library(reshape2)
library(tidyverse)
library(RColorBrewer)
library(ggsci)
library(scCustomize)
set.seed(42)
#set working directory
setwd("~/OneDrive - Queen Mary, University of London/QMUL/Lab/Coding/data/R/Seurat/SandLKerato")
#load the dataset from the raw data downloaded
kerato.data <- Read10X(data.dir = "~/OneDrive - Queen Mary, University of London/QMUL/Lab/Coding/data/R/Seurat/SandLKerato/rawdata/")
#initialise the seurat object with the raw (non-normalised) data.
kerato <- CreateSeuratObject(counts = kerato.data, project = "Kerato", min.cells = 3, min.features = 100)
Warning: Feature names cannot have underscores ('_'), replacing with dashes ('-')
kerato
# A Seurat-tibble abstraction: 299 × 4
# Features=10026 | Cells=299 | Active assay=RNA | Assays=RNA
.cell orig.ident nCount_RNA nFeature_RNA
<chr> <fct> <dbl> <int>
1 AAACGGGCAGGGTTAG-1 Kerato 2540 889
2 AAAGATGCAGTCGTGC-1 Kerato 19948 3642
3 AAAGATGGTCTAACGT-1 Kerato 522 364
4 AAATGCCCAGGAATGC-1 Kerato 6345 1834
5 AACACGTGTTAAGATG-1 Kerato 170 137
6 AACCATGCAACCGCCA-1 Kerato 263 203
7 AACCGCGCAGATAATG-1 Kerato 7711 1582
8 AACTCCCGTAATAGCA-1 Kerato 167 139
9 AACTCTTCAGGTTTCA-1 Kerato 310 202
10 AACTCTTGTAATTGGA-1 Kerato 205 139
# … with 289 more rows
# ℹ Use `print(n = ...)` to see more rows
# The [[ operator can add columns to object metadata. This is a great place to stash QC stats. Added percentage of mitochondrial RNA per barcode to 'percent.mt'.
grep("^Mt-",rownames(kerato@assays$RNA@counts),value = TRUE)
[1] "Mt-nd1" "Mt-nd2" "Mt-co1" "Mt-co2" "Mt-atp8" "Mt-atp6"
[7] "Mt-co3" "Mt-nd3" "Mt-nd4l" "Mt-nd4" "Mt-nd5" "Mt-nd6"
[13] "Mt-cyb"
kerato[["percent.mt"]] <- PercentageFeatureSet(kerato, pattern = "^Mt-")
# show example metadata present.
head(kerato@meta.data, 5)
NA
NA
#ribosomal genes
grep("^Rp[ls]",rownames(kerato@assays$RNA@counts),value = TRUE)
[1] "Rps12" "Rps9" "Rps5" "Rpl28" "Rpl9"
[6] "Rps19" "Rps16" "Rps11" "Rpl13a" "Rpl18"
[11] "Rps17" "Rps3" "Rpl27a" "Rps15a" "Rplp2"
[16] "Rps6kb2" "Rps6ka4" "Rps23" "Rpl22l1" "Rps3a"
[21] "Rps27" "Rpl38" "Rplp0l1" "Rpl34" "Rps4x"
[26] "Rpl32" "Rpl12" "Rpl35" "Rpl21" "Rps21"
[31] "Rpl7" "Rps20" "Rps6" "Rps8" "Rps27a"
[36] "Rps6ka1" "Rpl11" "Rpl21.1" "Rps6ka3" "Rps4x.1"
[41] "Rps6ka6" "Rpl10" "Rps10l1" "Rpl32.1" "Rpl41"
[46] "Rps15" "Rps28" "Rpl3" "Rps19bp1" "Rps25"
[51] "Rpl4" "Rps27l" "Rpl29" "Rpsa" "Rpl14"
[56] "Rpl36" "Rpl7l1" "Rpl31" "Rpl37" "Rpl37a"
[61] "Rpl5" "Rps27a.1" "Rpl30l1" "Rps6kc1" "Rps2"
[66] "Rpl30" "Rpl26" "Rpl36a" "Rpl23a" "Rps6kb1"
[71] "Rpl23" "Rpl19" "Rpl27" "Rps18" "Rpl15"
[76] "Rps24" "Rpl18a" "Rpl24" "Rpl36al" "Rps14"
[81] "Rpl17" "Rpl13" "Rps18l1" "Rps10" "Rpl10a"
[86] "Rpl35al1" "Rpl35al1.1" "Rpl21.3" "Rpl6" "Rplp0"
kerato[["percent.ribosomal"]] <- PercentageFeatureSet(kerato,pattern="^Rp[ls]")
head(kerato$percent.ribosomal)
AAACGGGCAGGGTTAG-1 AAAGATGCAGTCGTGC-1 AAAGATGGTCTAACGT-1
27.598425 22.563666 24.137931
AAATGCCCAGGAATGC-1 AACACGTGTTAAGATG-1 AACCATGCAACCGCCA-1
6.603625 27.058824 23.954373
# Visualize QC metrics as a violin plot
VlnPlot(kerato, features = c("nFeature_RNA", "nCount_RNA"))
VlnPlot(kerato, features = c("percent.mt", "percent.ribosomal"))
VlnPlot(kerato, features = c("nFeature_RNA", "nCount_RNA")) + scale_y_log10()
Scale for 'y' is already present. Adding another scale for 'y', which
will replace the existing scale.
VlnPlot(kerato, features = c("percent.mt", "percent.ribosomal")) + scale_y_log10()
Scale for 'y' is already present. Adding another scale for 'y', which
will replace the existing scale.
#In this example we run apply over the columns (cells) and calculate what percentage of the data comes from the single most observed gene. Again, having a high proportion of your data dominated by a single gene would be a concerning sign. We will also look later at the specific most highly expressed genes.
kerato[rownames(kerato) != "Malat1",] -> kerato.nomalat
apply(
kerato.nomalat@assays$RNA@counts,
2,
max
) -> kerato.nomalat$largest_count
apply(
kerato.nomalat@assays$RNA@counts,
2,
which.max
) -> kerato.nomalat$largest_index
rownames(kerato.nomalat)[kerato.nomalat$largest_index] -> kerato.nomalat$largest_gene
100 * kerato.nomalat$largest_count / kerato.nomalat$nCount_RNA -> kerato.nomalat$percent.Largest.Gene
kerato$largest_gene <- kerato.nomalat$largest_gene
kerato$percent.Largest.Gene <- kerato.nomalat$percent.Largest.Gene
#
# rm(kerato.nomalat) #will remove the nomalat columns
#no malat cells not removed due to reducing levels too much.
kerato
# A Seurat-tibble abstraction: 299 × 8
# Features=10026 | Cells=299 | Active assay=RNA | Assays=RNA
.cell orig.…¹ nCoun…² nFeat…³ perce…⁴ perce…⁵ large…⁶ perce…⁷
<chr> <fct> <dbl> <int> <dbl> <dbl> <chr> <dbl>
1 AAACGGGCAGGG… Kerato 2540 889 17.1 27.6 Mt-atp6 5.71
2 AAAGATGCAGTC… Kerato 19948 3642 9.15 22.6 Mt-atp6 2.33
3 AAAGATGGTCTA… Kerato 522 364 1.92 24.1 Rplp0 1.72
4 AAATGCCCAGGA… Kerato 6345 1834 26.3 6.60 AC1342… 11.3
5 AACACGTGTTAA… Kerato 170 137 0 27.1 Hspb1 2.94
6 AACCATGCAACC… Kerato 263 203 0.760 24.0 Rpl27a 2.28
7 AACCGCGCAGAT… Kerato 7711 1582 44.1 4.42 AC1342… 14.9
8 AACTCCCGTAAT… Kerato 167 139 0.599 26.9 Rps23 2.40
9 AACTCTTCAGGT… Kerato 310 202 11.9 17.1 AC1342… 11.9
10 AACTCTTGTAAT… Kerato 205 139 10.7 12.2 AC1342… 18.5
# … with 289 more rows, and abbreviated variable names ¹orig.ident,
# ²nCount_RNA, ³nFeature_RNA, ⁴percent.mt, ⁵percent.ribosomal,
# ⁶largest_gene, ⁷percent.Largest.Gene
# ℹ Use `print(n = ...)` to see more rows
VlnPlot(kerato, features=c("percent.Largest.Gene"))
#create table of QC metrics and name largest gene.
as_tibble(
kerato[[]],
rownames="Cell.Barcode"
) -> qc.metrics
qc.metrics
qc.metrics %>%
arrange(percent.mt) %>%
ggplot(aes(nCount_RNA,nFeature_RNA,colour=percent.mt)) +
geom_point() +
scale_color_gradientn(colours=c("black","blue","green2","red","yellow")) +
ggtitle("Example of plotting QC metrics") +
geom_hline(yintercept = 200) +
geom_hline(yintercept = 6000) + theme_calc() +
theme(panel.grid.major.x = element_line(colour = 'light grey')) +
xlab("Number of counts") +
ylab("Number of features") + labs(colour = "% mitochondrial RNA")
Error in grid.Call(C_textBounds, as.graphicsAnnot(x$label), x$x, x$y, :
polygon edge not found
#plotting complexity
# The standard way of calculating this is log10(genes)/log10(counts) however this gives absolute values which are difficult to judge. A possibly better approach is to fit a line through the cloud and then calculate the difference from the observed value to the expected.
qc.metrics %>%
mutate(complexity=log10(nFeature_RNA) / log10(nCount_RNA)) -> qc.metrics
lm(log10(qc.metrics$nFeature_RNA)~log10(qc.metrics$nCount_RNA)) -> complexity.lm
complexity.lm
Call:
lm(formula = log10(qc.metrics$nFeature_RNA) ~ log10(qc.metrics$nCount_RNA))
Coefficients:
(Intercept) log10(qc.metrics$nCount_RNA)
0.7349 0.6438
qc.metrics %>%
mutate(
complexity_diff = log10(nFeature_RNA) - ((log10(qc.metrics$nCount_RNA)*complexity.lm$coefficients[2])+complexity.lm$coefficients[1])
) -> qc.metrics
qc.metrics %>%
ggplot(aes(x=complexity_diff)) +
geom_density(fill="yellow") +
xlab("Complexity differential") +
ylab("Density") +
theme_calc() + geom_vline(xintercept = 0)
Error in grid.Call(C_textBounds, as.graphicsAnnot(x$label), x$x, x$y, :
polygon edge not found
min(c(max(qc.metrics$complexity_diff),0-min(qc.metrics$complexity_diff))) -> complexity_scale
qc.metrics %>%
mutate(complexity_diff=replace(complexity_diff,complexity_diff< -0.1,-0.1)) %>%
ggplot(aes(x=log10(nCount_RNA), y=log10(nFeature_RNA), colour=complexity_diff)) +
geom_point(size=0.5) +
geom_abline(slope=complexity.lm$coefficients[2], intercept = complexity.lm$coefficients[1]) +
scale_colour_gradient2(low="blue2",mid="grey",high="red2") +
xlab("log10(counts)") +
ylab("log10(features)") +
labs(colour = "Complexity differential") +
theme_calc() +
theme(panel.grid.major.x = element_line(colour = 'light grey'))
Error in grid.Call(C_textBounds, as.graphicsAnnot(x$label), x$x, x$y, :
polygon edge not found
# add generated metadata here
complexity.diff <- qc.metrics %>% pull(complexity_diff)
kerato@meta.data <- cbind(kerato@meta.data, complexity.diff)
# kerato <- AddMetaData(object = kerato, metadata = complexity.diff, col.name = "complexity_diff")
qc.metrics.log10.scatter <- qc.metrics %>%
arrange(percent.mt) %>%
ggplot(aes(nCount_RNA,nFeature_RNA,colour=percent.mt)) +
geom_point(size=0.7) +
scale_color_gradientn(colors=c("black","blue","green2","red","yellow")) +
ggtitle("QC metrics across barcodes") +
geom_abline(slope=complexity.lm$coefficients[2], intercept = complexity.lm$coefficients[1]) +
geom_hline(yintercept = 200) +
geom_hline(yintercept = 6000) +
scale_x_log10() + scale_y_log10() +
xlab("Log10(Number of counts)") +
ylab("Log10(Number of features)") +
labs(colour = "% mitochondrial RNA") + theme_calc() +
theme(panel.grid.major.x = element_line(colour = 'light grey'))
ggsave("qc_log10_scatter.tiff", plot = print(qc.metrics.log10.scatter, device = "tiff", height = 336, width = 544, units = "px"))
Saving 7 x 7 in image
qc.metrics %>%
arrange(percent.mt) %>%
ggplot(aes(nCount_RNA,nFeature_RNA,colour=percent.ribosomal)) +
geom_point(size=0.7) +
scale_color_gradientn(colors=c("black","blue","green2","red","yellow")) +
ggtitle("QC metrics across barcodes") +
geom_hline(yintercept = 200) +
geom_hline(yintercept = 6000) + geom_abline(slope=complexity.lm$coefficients[2], intercept = complexity.lm$coefficients[1]) +
scale_x_log10() + scale_y_log10() +
xlab("Log10(Number of counts)") +
ylab("Log10(Number of features)") +
labs(colour = "% ribosomal RNA") + theme_calc() +
theme(panel.grid.major.x = element_line(colour = 'light grey'))
qc.metrics %>%
ggplot(aes(x=percent.Largest.Gene, y=percent.ribosomal, colour = complexity_diff)) +
geom_point() +
geom_smooth(method = "lm")+
xlab("% largest gene") +
ylab("% ribosomal genes") +
ylim(0, NA) +
theme_calc() +
theme(panel.grid.major.x = element_line(colour = 'light grey')) + scale_color_gradientn(colors=c("black","blue","green2","red","yellow"))
NA
qc.metrics %>%
ggplot(aes(x=percent.Largest.Gene, y=percent.mt, colour = complexity_diff)) +
geom_point() +
geom_smooth(method = "lm")+
xlab("% largest gene") +
ylab("% mitochondrial genes") +
ylim(0, NA)+
theme_calc() +
theme(panel.grid.major.x = element_line(colour = 'light grey')) + scale_color_gradientn(colors=c("black","blue","green2","red","yellow"))
qc.metrics %>%
group_by(largest_gene) %>%
count() %>%
arrange(desc(n)) -> largest_gene_list
largest_gene_list
largest_gene_list %>%
filter(n>5) %>%
pull(largest_gene) -> largest_genes_to_plot
qc.metrics %>%
filter(largest_gene %in% largest_genes_to_plot) %>%
mutate(largest_gene=factor(largest_gene, levels=largest_genes_to_plot)) %>%
arrange(largest_gene) %>%
ggplot(aes(x=log10(nCount_RNA), y=log10(nFeature_RNA), colour=largest_gene)) +
geom_point(size=1) +
scale_colour_manual(values=c("grey",RColorBrewer::brewer.pal(9,"Set1"))) +
xlab("log10(counts)") +
ylab("log10(features)") +
labs(colour = "Largest gene") +
theme_calc() +
theme(panel.grid.major.x = element_line(colour = 'light grey'))
Error in grid.Call(C_textBounds, as.graphicsAnnot(x$label), x$x, x$y, :
polygon edge not found
qc.metrics %>%
filter(largest_gene %in% largest_genes_to_plot) %>%
mutate(largest_gene=factor(largest_gene, levels=largest_genes_to_plot)) %>%
arrange(largest_gene) %>%
ggplot(aes(x=complexity_diff, y=percent.Largest.Gene, colour=largest_gene)) +
geom_point()+
scale_colour_manual(values=c("grey",RColorBrewer::brewer.pal(9,"Set1"))) +
xlab("Complexity differential") +
ylab("% largest gene") +
labs(colour = "Largest gene") +
theme_calc() +
theme(panel.grid.major.x = element_line(colour = 'light grey'))
qc.metrics %>%
arrange(percent.mt) %>%
ggplot(aes(x=complexity_diff, y=percent.Largest.Gene, colour=percent.mt)) +
geom_point() +
scale_color_gradientn(colors=c("black","blue","green2","red","yellow")) +
xlab("Complexity differential") +
ylab("% largest gene") +
labs(colour = "% mitochondrial RNA") +
theme_calc() +
theme(panel.grid.major.x = element_line(colour = 'light grey'), axis.line=element_line(color="black"), axis.line.y = element_line(color="black"), text = element_text(size = 12))
qc.metrics %>%
arrange(percent.mt) %>%
ggplot(aes(x=complexity_diff, y=percent.mt, colour=percent.Largest.Gene)) +
geom_point() +
geom_smooth(method = "lm") +
scale_color_gradientn(colors=c("black","blue","green2","red","yellow")) +
xlab("Complexity differential") +
ylab("% mitochondrial RNA") +
labs(colour = "% largest gene") +
theme_calc() +
theme(panel.grid.major.x = element_line(colour = 'light grey'), axis.line=element_line(color="black"), axis.line.y = element_line(color="black"), text = element_text(size = 12))
qc.metrics %>%
arrange(percent.ribosomal) %>%
ggplot(aes(x=complexity_diff, y=percent.Largest.Gene, colour=percent.ribosomal)) +
geom_point() +
scale_color_gradientn(colors=c("black","blue","green2","red","yellow")) +
xlab("Complexity differential") +
ylab("% largest gene") +
labs(colour = "% ribosomal mRNA") +
theme_calc() +
theme(panel.grid.major.x = element_line(colour = 'light grey'), axis.line=element_line(color="black"), axis.line.y = element_line(color="black"), text = element_text(size = 12))
qc.metrics %>%
arrange(percent.mt) %>%
ggplot(aes(x=complexity_diff, y=percent.ribosomal, colour=percent.Largest.Gene)) +
geom_point() +
geom_smooth(method = "lm") +
scale_color_gradientn(colors=c("black","blue","green2","red","yellow")) +
xlab("Complexity differential") +
ylab("% ribosomal mRNA") +
labs(colour = "% largest gene") +
theme_calc() +
theme(panel.grid.major.x = element_line(colour = 'light grey'), axis.line=element_line(color="black"), axis.line.y = element_line(color="black"), text = element_text(size = 12))
qc.metrics %>%
ggplot(aes(percent.mt)) +
geom_histogram(binwidth = 0.5, fill="yellow", colour="black") +
ggtitle("Distribution of percentage mitochondrial RNA") +
geom_vline(xintercept = 50) +
xlab("% mitochondrial RNA") +
ylab("Count") +
theme_gdocs()
NA
NA
NA
qc.metrics %>%
ggplot(aes(percent.Largest.Gene)) +
geom_histogram(binwidth = 0.7, fill="coral", colour="black") +
ggtitle("Distribution of Percentage Largest Gene") +
xlab("% largest gene") +
ylab("Count") +
theme_gdocs()
ggplot(mapping = aes(kerato@assays$RNA@data["Gapdh",])) +
geom_histogram(binwidth = 0.05, fill="coral", colour="black") +
ggtitle("GAPDH expression distribution") +
xlab("GADPH Expression") +
ylab("Count") +
theme_calc()
# FeatureScatter is typically used to visualize feature-feature relationships, but can be used
# for anything calculated by the object, i.e. columns in object metadata, PC scores etc.
plot1 <- FeatureScatter(kerato, feature1 = "nCount_RNA", feature2 = "percent.mt")
plot2 <- FeatureScatter(kerato, feature1 = "nCount_RNA", feature2 = "nFeature_RNA")
plot3 <- FeatureScatter(kerato, feature1 = "percent.mt" , feature2 = "nFeature_RNA") + xlab('Percentage mitochondrial RNA') + ylab('Barcode feature count') + theme(legend.position = "None")
plot1
plot2
plot3
summary(kerato@meta.data)
orig.ident nCount_RNA nFeature_RNA percent.mt
Kerato:299 Min. : 167 Min. : 110 Min. : 0.0000
1st Qu.: 361 1st Qu.: 229 1st Qu.: 0.5935
Median : 2449 Median : 793 Median : 5.1345
Mean : 9164 Mean :1462 Mean :14.3097
3rd Qu.: 9142 3rd Qu.:2232 3rd Qu.:26.2106
Max. :83967 Max. :6080 Max. :76.4802
percent.ribosomal largest_gene percent.Largest.Gene
Min. : 0.8006 Length:299 Min. : 1.229
1st Qu.: 7.0124 Class :character 1st Qu.: 2.344
Median :20.7843 Mode :character Median : 3.670
Mean :18.7268 Mean : 7.683
3rd Qu.:27.6145 3rd Qu.:11.691
Max. :43.7925 Max. :35.817
complexity.diff
Min. :-0.276364
1st Qu.:-0.041604
Median : 0.004741
Mean : 0.000000
3rd Qu.: 0.050976
Max. : 0.165463
# make new dataframe with superfluous info removed
keep.columns <- c("Cell.Barcode","nCount_RNA","nFeature_RNA","percent.mt","percent.ribosomal","percent.Largest.Gene")
melt.qc <- qc.metrics[keep.columns]
# melt the dataframe so boxplot of QCs can be generated
melt.qc <- melt(melt.qc, id="Cell.Barcode")
# plot violin plots of metrics
qc.metrics.violin <- ggplot(melt.qc, aes(x = variable, y = value)) +
geom_violin(aes(x = variable, y = value, fill = variable)) + geom_jitter(size = 0.2, position = position_jitter(seed= 1, width = 0.2)) + facet_wrap(~ variable, scales = "free") +
theme_calc() + theme(title = element_blank(), axis.text.x = element_blank(), strip.text = element_blank(), legend.position = c(0.85, 0.2)) +labs(colour = "Measurement", x = element_blank(), y = "Value") + scale_fill_npg(name = "Measurement", labels=c("Number of counts", "Number of features","% mtRNA", "% ribosomal genes","% largest gene" ))
qc.metrics.violin
ggsave("qc_violins.tiff", plot = print(qc.metrics.violin, device = "tiff", dpi = 400))
Saving 5.67 x 3.5 in image
# plot violin plots of metrics with logarithmic scaled values
qc.metrics.violin.log10 <- ggplot(melt.qc, aes(x = variable, y = value)) +
geom_violin(aes(x = variable, y = value, fill = variable)) + geom_jitter(size = 0.2, position = position_jitter(seed= 1, width = 0.2)) + facet_wrap(~ variable, scales = "free") + scale_y_log10()+
theme_calc() + theme(title = element_blank(), axis.text.x = element_blank(), strip.text = element_blank(), legend.position = c(0.85, 0.2)) +labs(colour = "Measurement", x = element_blank(), y = "Log10(Measurement Value)") + scale_fill_npg(name = "Measurement", labels=c("Number of counts", "Number of features","% mtRNA", "% ribosomal genes","% largest gene" ))
qc.metrics.violin.log10
Warning: Transformation introduced infinite values in continuous y-axis
Warning: Transformation introduced infinite values in continuous y-axis
Warning: Removed 27 rows containing non-finite values (stat_ydensity).
Warning: Removed 27 rows containing missing values (geom_point).
ggsave("qc_violins_log10.tiff", plot = print(qc.metrics.violin.log10, device = "tiff", dpi = 400))
Saving 5.67 x 3.5 in image
Warning: Transformation introduced infinite values in continuous y-axis
Warning: Transformation introduced infinite values in continuous y-axis
Warning: Removed 27 rows containing non-finite values (stat_ydensity).
Warning: Removed 27 rows containing missing values (geom_point).
Warning: Transformation introduced infinite values in continuous y-axis
Warning: Transformation introduced infinite values in continuous y-axis
Warning: Removed 27 rows containing non-finite values (stat_ydensity).
Warning: Removed 27 rows containing missing values (geom_point).
# here is where we filter with QC metrics, look at violin plots to see number of cells excluded. Will need high mt% and low feature no. to process majority of cells
kerato <- subset(kerato, subset = nFeature_RNA > 200 & nFeature_RNA < 6000 & percent.mt < 50)
kerato_info <- kerato@meta.data %>% as.data.frame()
## extract meta data
# the resulting object has one "row" per cell
cat('Number of cells in analysis:', nrow(kerato_info))
Number of cells in analysis: 227
#number of cells pulled through using the filters above is printed to the terminal.
#log normalisation of data
kerato <- NormalizeData(kerato, normalization.method = "LogNormalize", scale.factor = 10000)
Performing log-normalization
0% 10 20 30 40 50 60 70 80 90 100%
[----|----|----|----|----|----|----|----|----|----|
**************************************************|
library(Seurat)
library("biomaRt")
# sphase_humanGenes <- cc.genes.updated.2019$s.genes
# g2mphase_humanGenes <- cc.genes.updated.2019$g2m.genes
#
# human = useMart("ensembl", dataset = "hsapiens_gene_ensembl")
# rat = useMart("ensembl", dataset = "rnorvegicus_gene_ensembl")
#
# x = sphase_humanGenes
#
# r.s.genes = getLDS(attributes = c("rgd_symbol"),
# filters = "rgd_symbol",
# values = x ,
# mart = rat,
# attributesL = c("hgnc_symbol"),
# martL = human,
# uniqueRows=T)
#
# x = g2mphase_humanGenes
# r.g2m.genes = getLDS(attributes = c("rgd_symbol"),
# filters = "rgd_symbol",
# values = x ,
# mart = rat,
# attributesL = c("hgnc_symbol"),
# martL = human,
# uniqueRows=T)
#function to check mirror access is working
# ensembl = useMart("ensembl", host="https://useast.ensembl.org")
# dim(listDatasets(ensembl))
# host="https://useast.ensembl.org",
# Basic function to convert mouse to human gene names
convertHumanGeneList <- function(x){
require("biomaRt")
# mart <- useMart("ENSEMBL_MART_ENSEMBL")
# human <- useDataset("hsapiens_gene_ensembl", mart)
human.mart <- biomaRt::useMart(host="https://dec2021.archive.ensembl.org", "ENSEMBL_MART_ENSEMBL", dataset="hsapiens_gene_ensembl")
# rat <- useDataset("rnorvegicus_gene_ensembl", mart)
rat.mart <- biomaRt::useMart(host="https://dec2021.archive.ensembl.org", "ENSEMBL_MART_ENSEMBL", dataset="rnorvegicus_gene_ensembl")
# human = useMart("ensembl", host = 'https://www.ensembl.org', dataset = "hsapiens_gene_ensembl")
# rat = useMart("ensembl", host = 'https://www.ensembl.org', dataset = "rnorvegicus_gene_ensembl")
# genesV2 = getLDS(attributes = c("rgd_symbol"),
# filters = "rgd_symbol",
# values = x ,
# mart = rat,
# attributesL = c("hgnc_symbol"),
# martL = human,
# uniqueRows=T)
genesV2 = getLDS(attributes = c("hgnc_symbol"),
filters = "hgnc_symbol",
values = x ,
mart = human.mart,
attributesL = c("rgd_symbol"),
martL = rat.mart,
uniqueRows=T)
ratx <- unique(genesV2[, 2])
# Print the first 6 genes found to the screen
print(head(ratx))
return(ratx)
}
# maybe try this : https://support.bioconductor.org/p/122534/
r.s.genes <- convertHumanGeneList(cc.genes.updated.2019$s.genes)
[1] "LOC100910528" "Mrpl36" "Wdr76" "Fen1"
[5] "LOC100911660" "Hells"
r.g2m.genes <- convertHumanGeneList(cc.genes.updated.2019$g2m.genes)
[1] "Tubb4b" "Mki67" "Kif11" "Ckap2l" "Kif20b" "Cenpe"
head(r.s.genes)
[1] "LOC100910528" "Mrpl36" "Wdr76" "Fen1"
[5] "LOC100911660" "Hells"
head(r.g2m.genes)
[1] "Tubb4b" "Mki67" "Kif11" "Ckap2l" "Kif20b" "Cenpe"
length(cc.genes.updated.2019$s.genes)
[1] 43
length(r.s.genes)
[1] 46
length(cc.genes.updated.2019$g2m.genes)
[1] 54
length(r.g2m.genes)
[1] 67
kerato <- CellCycleScoring(kerato, s.features = r.s.genes, g2m.features = r.g2m.genes, set.ident = TRUE)
Warning: The following features are not present in the object: LOC100910528, Wdr76, LOC100911660, E2f8, AC129365.1, LOC120094818, Exo1, Clspn, Dtl, not searching for symbol synonyms
Warning: The following features are not present in the object: AABR07049223.1, Cks1l, AC120066.1, RGD1559962, LOC100364016, AC112018.1, LOC120102505, LOC680565, LOC100912261, Ccnb2, Hjurp, AABR07015743.1, RGD1561694, Hmgb2, Nek2, Nek2l1, AABR07028615.2, LOC100362620, not searching for symbol synonyms
kerato[[]]
#remake column of nested list like cc.genes
# r.s.genes <- s_genes$RGD.symbol
# g2m.genes <- g2m_genes$RGD.symbol
# #nested list with equivalent titles.
# ratGenes <- list(s.genes = s_genes, g2m.genes = g2m_genes)
# kerato <- CellCycleScoring(kerato, s.features = ratGenes$s.genes, g2m.features = ratGenes$g2m.genes, set.ident = TRUE)
#this command reveals that not enough of the genes exist in the dataset for this analysis to be performed.
#even building a new seurat object with min.cells = 1 (only allows genes in that have expression in 1 cell) this fails
kerato.tbl <- as_tibble(kerato[[]]) # Replicate original data
kerato.tbl$Phase <- factor(kerato.tbl$Phase, # Change ordering manually
levels = c("G1","S","G2M"))
kerato.tbl %>%
ggplot(aes(Phase)) + geom_bar(aes(fill = Phase)) + theme_calc() + scale_fill_npg()
kerato.tbl %>%
ggplot(aes(x=S.Score, y=G2M.Score, color=Phase)) +
geom_point() +
coord_cartesian(xlim=c(-0.15,0.15), ylim=c(-0.15,0.15)) +
theme_calc() +
theme(panel.grid.major.x = element_line(colour = 'light grey'), axis.line=element_line(color="black"), axis.line.y = element_line(color="black"), text = element_text(size = 12)) +scale_color_npg()
table(kerato.tbl$Phase)
G1 S G2M
125 50 52
#finding HVGs
# vst: First, fits a line to the relationship of log(variance) and log(mean) using local polynomial regression (loess). Then standardizes the feature values using the observed mean and expected variance (given by the fitted line). Feature variance is then calculated on the standardized values after clipping to a maximum (see clip.max parameter).
kerato <- FindVariableFeatures(kerato, selection.method = "vst", nfeatures = 2000)
Calculating gene variances
0% 10 20 30 40 50 60 70 80 90 100%
[----|----|----|----|----|----|----|----|----|----|
**************************************************|
Calculating feature variances of standardized and clipped values
0% 10 20 30 40 50 60 70 80 90 100%
[----|----|----|----|----|----|----|----|----|----|
**************************************************|
# Identify the 10 most highly variable genes
top10 <- head(VariableFeatures(kerato), 10)
# plot variable features with and without labels
plot1 <- VariableFeaturePlot(kerato)
plot2 <- LabelPoints(plot = plot1, points = top10, repel = TRUE)
When using repel, set xnudge and ynudge to 0 for optimal results
plot1
plot2
Warning: ggrepel: 3 unlabeled data points (too many overlaps). Consider increasing max.overlaps
as_tibble(HVFInfo(kerato),rownames = "Gene") -> variance.data
variance.data %>%
mutate(hypervariable=Gene %in% VariableFeatures(kerato)
) -> variance.data
head(variance.data, n=10)
variance.data %>%
ggplot(aes(log(mean),log(variance),color=hypervariable)) +
geom_point() +
scale_color_manual(values=c("black","red")) +
labs(colour = "Hypervariable?") +
theme_calc() +
theme(panel.grid.major.x = element_line(colour = 'light grey'), axis.line=element_line(color="black"), axis.line.y = element_line(color="black"), text = element_text(size = 12))
#Next, we apply a linear transformation ('scaling') that is a standard pre-processing step prior to dimensional reduction techniques like PCA. The ScaleData function:
#Shifts the expression of each gene, so that the mean expression across cells is 0
#Scales the expression of each gene, so that the variance across cells is 1
#This step gives equal weight in downstream analyses, so that highly-expressed genes do not dominate
#The results of this are stored in pbmc[["RNA"]]@scale.data
all.genes <- rownames(kerato)
kerato <- ScaleData(kerato, features = all.genes)
Centering and scaling data matrix
|
| | 0%
|
|====== | 9%
|
|=========== | 18%
|
|================= | 27%
|
|======================= | 36%
|
|============================ | 45%
|
|================================== | 55%
|
|======================================= | 64%
|
|============================================= | 73%
|
|=================================================== | 82%
|
|======================================================== | 91%
|
|==============================================================| 100%
#run the PCA analysis of the dataset
kerato <- RunPCA(kerato, features = VariableFeatures(object = kerato))
PC_ 1
Positive: Desi1, Abhd8, Bloc1s1, Dctpp1, Ftl1, Erlec1, Lrrc59, Nans, Csnk1g2, Cmtm7
Hspe1, Ybx3, Rpa3, Fkbp2, Zcrb1, AABR07035541.2, Bola3, Cdk2ap1, Tceal9, Atp6v1d
Rps18, Cdkn2aipnl, Mrps6, Tipinl1, Sms, LOC100360087, Bola2, Tctex1d2, Uck2, Mrpl36
Negative: Krt17, Cnfn, Krtdap, Krt10, Hspb1, Fabp5, Sprr1a, Klk6, Krt16, Dmkn
Mfap5, Ly6d, Elovl4, Cryab, Tmem251, AC139608.2, Lrba, Sdr16c6, Atp6v0a4, Nfkbie
Wdr41, Endou, Sbsn, Slc34a3, Nr5a2, Tgm3, Klk7, Cenpf, Cdkn1c, Lipn
PC_ 2
Positive: Cldn4, Psapl1, Dsg3, Tacstd2, Cyp4f39, Ppl, Spint1, Sptbn2, Tgm1, Tmem45b
Gdpd3, Prss27, Acsl1, Evpl, Clca2, Fam129b, Sbsn, Ide, Serpinb11, Grhl3
Dmkn, Gprc5a, Serpinb2, Ctsa, Krt17, Nectin2, Atl2, Hspa5, Atp6ap2, Hk2
Negative: LOC100364435, Rps23, Rpl21.3, Rps2, Rplp0, Pdgfa, Rps12, Lmo1, Mt2A, Rps27a.1
Ftl1, Rps15a, Cavin3, Rps14, Slc25a4, Cav1, Rgs10, Hmgn2, Mt1, H2afv
Tmsb4x, Wnt6, AABR07026311.1, Stmn1, LOC100360087, Fth1, Emp3, Cda, Wnt10a, Ifitm3
PC_ 3
Positive: Fabp5, Gltp, Rps23, Rpl21.3, Tmsb4x, Rplp0, Rps12, Rps15a, Rps2, Rps14
Rps27a.1, Hopx, Gpx2, Perp, Cysrt1, Gng5, Ly6d, Rab25, Dbi, Arpc1a
Dmkn, LOC100364435, Sult2b1, Sprr1a, Krtdap, S100a16, Rbp2, Fth1, Msrb1, Gng12
Negative: Snhg11, Klf8, Tnc, Slc2a1, Col3a1, Ctsl, C1s, C1r, Hspa5, Thbs2
Ptgs2, Dpp7, Jag2, Fn1, Slc1a3, Clu, Fst, Cers4, Igfbp2, Ccdc80
Lamb1, Apoe, Wnt6, Gigyf1, Hsp90b1, Slc1a5, Pdgfa, Phpt1, Igfbp7, Dsg3
PC_ 4
Positive: Fstl1, C1s, Igfbp2, Kitlg, Igfbp5, Jag2, Tmem205, Akr1b1, Ifi27, Slc1a3
Arl4a, Lef1, Traf1, Tspan17, Slc25a4, Tp53i11, Tmem136, Apoe, Rerg, Akap12
Fam117a, S100a4, Tnfrsf21, Lztr1, Spon2, Mrps6, Cers4, Cnpy4, Igfbp7, Zfp358
Negative: Mki67, Racgap1, Cenpe, Cdca3, Hmgb2l1, Cenpa, Psrc1, Smc2, Ckap2, Kif20b
Tpx2, Cks2, Arhgap11a, Ndc80, Kif2c, Cenpm, Cdca2, LOC100359539, Cenpf, Aurka
Ect2, Kifc1, Top2a, Spag5.1, Fam83d, Sgo2, AABR07069282.1, Prc1, Kif11, Spc24
PC_ 5
Positive: Fam92a, Ahcyl1, Ptpn12, Clca2, AABR07044001.4, Chn2, Prkag2, Jup, Dsg3, Serinc5
Hist1h1b, Hist1h2an, Ldah, Pacrgl, Bspry, Bfsp2, Gatad1, Ptpn21, Aktip, RGD1560394
Epha2, Fam131c, Map2, Pkib, AABR07049695.3, Dnm1l, RGD1309036, Dph3, Ocel1, Snhg11
Negative: Hspb1, Glod4, Krt17, Anxa1, Sprr1a, Cnfn, Krt15, Ldha, Psmb6, Krt10
Fdps, Anxa2, Ttk, Tuba1c, Gstp1, Dstn, Tuba1b, Dbi, Anxa8, Prdx1
Anp32e, Ttc5, Actb, Tubb6, Mdh1, Prkar1a, Kifc1, Klk6, Kif20a, Sec13
# Examine and visualize PCA results a few different ways
print(kerato[["pca"]], dims = 1:5, nfeatures = 5)
PC_ 1
Positive: Desi1, Abhd8, Bloc1s1, Dctpp1, Ftl1
Negative: Krt17, Cnfn, Krtdap, Krt10, Hspb1
PC_ 2
Positive: Cldn4, Psapl1, Dsg3, Tacstd2, Cyp4f39
Negative: LOC100364435, Rps23, Rpl21.3, Rps2, Rplp0
PC_ 3
Positive: Fabp5, Gltp, Rps23, Rpl21.3, Tmsb4x
Negative: Snhg11, Klf8, Tnc, Slc2a1, Col3a1
PC_ 4
Positive: Fstl1, C1s, Igfbp2, Kitlg, Igfbp5
Negative: Mki67, Racgap1, Cenpe, Cdca3, Hmgb2l1
PC_ 5
Positive: Fam92a, Ahcyl1, Ptpn12, Clca2, AABR07044001.4
Negative: Hspb1, Glod4, Krt17, Anxa1, Sprr1a
#visualise the PCA coordinates of genes
VizDimLoadings(kerato, dims = 1, nfeatures = 20, reduction = "pca") + coord_flip() + scale_x_reverse() + theme(axis.text.x = element_text(size = 6,angle = 45, vjust=1, hjust = 1))+ scale_colour_npg()
# plot cells using two PCAs as axis.
DimPlot(kerato, reduction = "pca")
names(qc.metrics)
[1] "Cell.Barcode" "orig.ident"
[3] "nCount_RNA" "nFeature_RNA"
[5] "percent.mt" "percent.ribosomal"
[7] "largest_gene" "percent.Largest.Gene"
[9] "complexity" "complexity_diff"
# umap_largest_genes_1 <- DimPlot(kerato, reduction="umap", group.by = "largest_gene",label = TRUE, label.size = 3)
# umap_largest_genes_2 <- LabelPoints(plot = umap_largest_genes_1, points = largest_genes_to_plot, repel = TRUE)
#
#
# umap_largest_genes_1
# umap_largest_genes_2
# plot1 <- VariableFeaturePlot(kerato)
# plot2 <- LabelPoints(plot = plot1, points = top10, repel = TRUE)
# plot1
# plot2
#In particular DimHeatmap allows for easy exploration of the primary sources of heterogeneity in a dataset, and can be useful when trying to decide which PCs to include for further downstream analyses. Both cells and features are ordered according to their PCA scores. Setting cells to a number plots the 'extreme' cells on both ends of the spectrum, which dramatically speeds plotting for large datasets. Though clearly a supervised analysis, we find this to be a valuable tool for exploring correlated feature sets.
DimHeatmap(kerato, dims = 1:2, cells = 300, balanced = TRUE)
Warning: Requested number is larger than the number of available items (227). Setting to 227.
Warning: Requested number is larger than the number of available items (227). Setting to 227.
DimHeatmap(kerato, dims = 3:4, cells = 300, balanced = TRUE)
Warning: Requested number is larger than the number of available items (227). Setting to 227.
Warning: Requested number is larger than the number of available items (227). Setting to 227.
DimHeatmap(kerato, dims = 5:6, cells = 300, balanced = TRUE)
Warning: Requested number is larger than the number of available items (227). Setting to 227.
Warning: Requested number is larger than the number of available items (227). Setting to 227.
DimHeatmap(kerato, dims = 7:8, cells = 300, balanced = TRUE)
Warning: Requested number is larger than the number of available items (227). Setting to 227.
Warning: Requested number is larger than the number of available items (227). Setting to 227.
# NOTE: This process can take a long time for big datasets, comment out for expediency. More approximate techniques such as those implemented in ElbowPlot() can be used to reduce computation time
kerato <- JackStraw(kerato, num.replicate = 100)
| | 0 % ~calculating
|+ | 1 % ~14s
|+ | 2 % ~13s
|++ | 3 % ~13s
|++ | 4 % ~13s
|+++ | 5 % ~14s
|+++ | 6 % ~14s
|++++ | 7 % ~13s
|++++ | 8 % ~13s
|+++++ | 9 % ~13s
|+++++ | 10% ~13s
|++++++ | 11% ~13s
|++++++ | 12% ~12s
|+++++++ | 13% ~13s
|+++++++ | 14% ~12s
|++++++++ | 15% ~13s
|++++++++ | 16% ~13s
|+++++++++ | 17% ~13s
|+++++++++ | 18% ~12s
|++++++++++ | 19% ~12s
|++++++++++ | 20% ~12s
|+++++++++++ | 21% ~12s
|+++++++++++ | 22% ~12s
|++++++++++++ | 23% ~12s
|++++++++++++ | 24% ~12s
|+++++++++++++ | 25% ~12s
|+++++++++++++ | 26% ~11s
|++++++++++++++ | 27% ~11s
|++++++++++++++ | 28% ~11s
|+++++++++++++++ | 29% ~11s
|+++++++++++++++ | 30% ~11s
|++++++++++++++++ | 31% ~11s
|++++++++++++++++ | 32% ~11s
|+++++++++++++++++ | 33% ~10s
|+++++++++++++++++ | 34% ~10s
|++++++++++++++++++ | 35% ~10s
|++++++++++++++++++ | 36% ~10s
|+++++++++++++++++++ | 37% ~10s
|+++++++++++++++++++ | 38% ~10s
|++++++++++++++++++++ | 39% ~10s
|++++++++++++++++++++ | 40% ~09s
|+++++++++++++++++++++ | 41% ~09s
|+++++++++++++++++++++ | 42% ~09s
|++++++++++++++++++++++ | 43% ~09s
|++++++++++++++++++++++ | 44% ~09s
|+++++++++++++++++++++++ | 45% ~09s
|+++++++++++++++++++++++ | 46% ~08s
|++++++++++++++++++++++++ | 47% ~08s
|++++++++++++++++++++++++ | 48% ~08s
|+++++++++++++++++++++++++ | 49% ~08s
|+++++++++++++++++++++++++ | 50% ~08s
|++++++++++++++++++++++++++ | 51% ~08s
|++++++++++++++++++++++++++ | 52% ~08s
|+++++++++++++++++++++++++++ | 53% ~07s
|+++++++++++++++++++++++++++ | 54% ~07s
|++++++++++++++++++++++++++++ | 55% ~07s
|++++++++++++++++++++++++++++ | 56% ~07s
|+++++++++++++++++++++++++++++ | 57% ~07s
|+++++++++++++++++++++++++++++ | 58% ~07s
|++++++++++++++++++++++++++++++ | 59% ~06s
|++++++++++++++++++++++++++++++ | 60% ~06s
|+++++++++++++++++++++++++++++++ | 61% ~06s
|+++++++++++++++++++++++++++++++ | 62% ~06s
|++++++++++++++++++++++++++++++++ | 63% ~06s
|++++++++++++++++++++++++++++++++ | 64% ~06s
|+++++++++++++++++++++++++++++++++ | 65% ~06s
|+++++++++++++++++++++++++++++++++ | 66% ~05s
|++++++++++++++++++++++++++++++++++ | 67% ~05s
|++++++++++++++++++++++++++++++++++ | 68% ~05s
|+++++++++++++++++++++++++++++++++++ | 69% ~05s
|+++++++++++++++++++++++++++++++++++ | 70% ~05s
|++++++++++++++++++++++++++++++++++++ | 71% ~05s
|++++++++++++++++++++++++++++++++++++ | 72% ~04s
|+++++++++++++++++++++++++++++++++++++ | 73% ~04s
|+++++++++++++++++++++++++++++++++++++ | 74% ~04s
|++++++++++++++++++++++++++++++++++++++ | 75% ~04s
|++++++++++++++++++++++++++++++++++++++ | 76% ~04s
|+++++++++++++++++++++++++++++++++++++++ | 77% ~04s
|+++++++++++++++++++++++++++++++++++++++ | 78% ~03s
|++++++++++++++++++++++++++++++++++++++++ | 79% ~03s
|++++++++++++++++++++++++++++++++++++++++ | 80% ~03s
|+++++++++++++++++++++++++++++++++++++++++ | 81% ~03s
|+++++++++++++++++++++++++++++++++++++++++ | 82% ~03s
|++++++++++++++++++++++++++++++++++++++++++ | 83% ~03s
|++++++++++++++++++++++++++++++++++++++++++ | 84% ~03s
|+++++++++++++++++++++++++++++++++++++++++++ | 85% ~02s
|+++++++++++++++++++++++++++++++++++++++++++ | 86% ~02s
|++++++++++++++++++++++++++++++++++++++++++++ | 87% ~02s
|++++++++++++++++++++++++++++++++++++++++++++ | 88% ~02s
|+++++++++++++++++++++++++++++++++++++++++++++ | 89% ~02s
|+++++++++++++++++++++++++++++++++++++++++++++ | 90% ~02s
|++++++++++++++++++++++++++++++++++++++++++++++ | 91% ~01s
|++++++++++++++++++++++++++++++++++++++++++++++ | 92% ~01s
|+++++++++++++++++++++++++++++++++++++++++++++++ | 93% ~01s
|+++++++++++++++++++++++++++++++++++++++++++++++ | 94% ~01s
|++++++++++++++++++++++++++++++++++++++++++++++++ | 95% ~01s
|++++++++++++++++++++++++++++++++++++++++++++++++ | 96% ~01s
|+++++++++++++++++++++++++++++++++++++++++++++++++ | 97% ~00s
|+++++++++++++++++++++++++++++++++++++++++++++++++ | 98% ~00s
|++++++++++++++++++++++++++++++++++++++++++++++++++| 99% ~00s
|++++++++++++++++++++++++++++++++++++++++++++++++++| 100% elapsed=16s
| | 0 % ~calculating
|+++ | 5 % ~00s
|+++++ | 10% ~00s
|++++++++ | 15% ~00s
|++++++++++ | 20% ~00s
|+++++++++++++ | 25% ~00s
|+++++++++++++++ | 30% ~00s
|++++++++++++++++++ | 35% ~00s
|++++++++++++++++++++ | 40% ~00s
|+++++++++++++++++++++++ | 45% ~00s
|+++++++++++++++++++++++++ | 50% ~00s
|++++++++++++++++++++++++++++ | 55% ~00s
|++++++++++++++++++++++++++++++ | 60% ~00s
|+++++++++++++++++++++++++++++++++ | 65% ~00s
|+++++++++++++++++++++++++++++++++++ | 70% ~00s
|++++++++++++++++++++++++++++++++++++++ | 75% ~00s
|++++++++++++++++++++++++++++++++++++++++ | 80% ~00s
|+++++++++++++++++++++++++++++++++++++++++++ | 85% ~00s
|+++++++++++++++++++++++++++++++++++++++++++++ | 90% ~00s
|++++++++++++++++++++++++++++++++++++++++++++++++ | 95% ~00s
|++++++++++++++++++++++++++++++++++++++++++++++++++| 100% elapsed=00s
kerato <- ScoreJackStraw(kerato, dims = 1:20)
JackStrawPlot(kerato, dims = 1:15)
Warning: Removed 24054 rows containing missing values (geom_point).
#elbow plot , shows SD of PCs, elbow is where significance should begin to be negligible.
ElbowPlot(kerato)
#Here is where we optimise the number of PCs used to cluster the cells, and the resolution of the clustering algorithm.
kerato <- FindNeighbors(kerato, dims = 1:7)
Computing nearest neighbor graph
Computing SNN
kerato <- FindClusters(kerato, resolution = 0.4)
Modularity Optimizer version 1.3.0 by Ludo Waltman and Nees Jan van Eck
Number of nodes: 227
Number of edges: 5790
Running Louvain algorithm...
0% 10 20 30 40 50 60 70 80 90 100%
[----|----|----|----|----|----|----|----|----|----|
**************************************************|
Maximum modularity in 10 random starts: 0.8002
Number of communities: 4
Elapsed time: 0 seconds
head(Idents(kerato), 5)
AAACGGGCAGGGTTAG-1 AAAGATGCAGTCGTGC-1 AAAGATGGTCTAACGT-1
0 2 0
AAATGCCCAGGAATGC-1 AACCATGCAACCGCCA-1
1 0
Levels: 0 1 2 3
#Plot UMAP
kerato <- RunUMAP(kerato, dims = 1:7)
16:34:54 UMAP embedding parameters a = 0.9922 b = 1.112
16:34:54 Read 227 rows and found 7 numeric columns
16:34:54 Using Annoy for neighbor search, n_neighbors = 30
16:34:54 Building Annoy index with metric = cosine, n_trees = 50
0% 10 20 30 40 50 60 70 80 90 100%
[----|----|----|----|----|----|----|----|----|----|
**************************************************|
16:34:54 Writing NN index file to temp file /var/folders/17/t98thp9n3zb7xfm2pnqtq84w0000gn/T//RtmpGR52GC/filed9d8b70adc2
16:34:54 Searching Annoy index using 1 thread, search_k = 3000
16:34:54 Annoy recall = 100%
16:34:56 Commencing smooth kNN distance calibration using 1 thread with target n_neighbors = 30
16:34:57 Initializing from normalized Laplacian + noise (using irlba)
16:34:57 Commencing optimization for 500 epochs, with 7930 positive edges
Using method 'umap'
0% 10 20 30 40 50 60 70 80 90 100%
[----|----|----|----|----|----|----|----|----|----|
**************************************************|
16:34:58 Optimization finished
DimPlot(kerato, reduction = "umap")
FeaturePlot(kerato, feature = "complexity.diff")
# DimPlot(kerato, reduction = "pca")
kerato@meta.data %>%
group_by(seurat_clusters,Phase) %>%
count() %>%
group_by(seurat_clusters) %>%
mutate(percent=100*n/sum(n)) %>%
ungroup() %>%
ggplot(aes(x=seurat_clusters,y=percent, fill=Phase)) +
geom_col() +
ggtitle("Percentage of cell cycle phases per cluster") + theme_calc() + labs( x = "Cluster", y = "Percent")
VlnPlot(kerato,features="percent.Largest.Gene") + labs(title = "Percentage largest gene", x = "Cluster")
VlnPlot(kerato,features="percent.ribosomal") + labs(title = "Percentage ribosomal genes", x = "Cluster")
VlnPlot(kerato,features="percent.mt") + labs(title = "Percentage mitochondrial genes", x = "Cluster")
VlnPlot(kerato,features="nFeature_RNA") + labs(title = "Features per barcode", x = "Cluster")
VlnPlot(kerato,features="nCount_RNA") + labs(title = "Counts per barcode", x = "Cluster")
NA
NA
kerato@reductions$umap@cell.embeddings %>%
as_tibble() %>%
add_column(seurat_clusters=kerato$seurat_clusters, largest_gene=kerato$largest_gene) %>%
filter(largest_gene %in% largest_genes_to_plot) %>%
ggplot(aes(x=UMAP_1, y=UMAP_2, colour=seurat_clusters)) +
geom_point() +
facet_wrap(vars(largest_gene)) + theme_clean()
kerato@reductions$umap@cell.embeddings %>%
as_tibble() %>%
add_column(seurat_clusters=kerato$seurat_clusters, Phase=kerato$Phase) %>%
ggplot(aes(x=UMAP_1, y=UMAP_2, colour=seurat_clusters)) +
geom_point() +
facet_wrap(vars(Phase)) + theme_clean()
#You can save the object at this point so that it can easily be loaded back in without having to rerun the computationally intensive steps performed above, or easily shared with collaborators.
saveRDS(kerato, file = "smalllargekera.rds")
old.kerato <- readRDS("pseudoSLkera.rds")
DimPlot(old.kerato, reduction = "umap")
DimPlot(kerato, reduction = "umap")
# plot QC metrics on UMAP
FeaturePlot(kerato,feature = "complexity.diff") + scale_color_gradientn(colors=c("black","purple","yellow")) + labs(x = "UMAP 1", y = "UMAP 2", title = "Complexity Differential")
Scale for 'colour' is already present. Adding another scale for
'colour', which will replace the existing scale.
FeaturePlot(kerato,feature = "nCount_RNA") + scale_color_gradientn(colors=c("black","purple","yellow")) + labs(x = "UMAP 1", y = "UMAP 2", title = "Counts per barcode")
Scale for 'colour' is already present. Adding another scale for
'colour', which will replace the existing scale.
FeaturePlot(kerato,feature = "nFeature_RNA") + scale_color_gradientn(colors=c("black","purple","yellow")) + labs(x = "UMAP 1", y = "UMAP 2", title = "Features per barcode")
Scale for 'colour' is already present. Adding another scale for
'colour', which will replace the existing scale.
FeaturePlot(kerato,feature = "percent.mt") + scale_color_gradientn(colors=c("black","purple","yellow")) + labs(x = "UMAP 1", y = "UMAP 2", title = "Percentage mitochondrial genes")
Scale for 'colour' is already present. Adding another scale for
'colour', which will replace the existing scale.
FeaturePlot(kerato,feature = "percent.ribosomal") + scale_color_gradientn(colors=c("black","purple","yellow")) + labs(x = "UMAP 1", y = "UMAP 2", title = "Percentage ribosomal genes")
Scale for 'colour' is already present. Adding another scale for
'colour', which will replace the existing scale.
FeaturePlot(kerato,feature = "percent.Largest.Gene") + scale_color_gradientn(colors=c("black","purple","yellow")) + labs(x = "UMAP 1", y = "UMAP 2", title = "Percentage largest genes")
Scale for 'colour' is already present. Adding another scale for
'colour', which will replace the existing scale.
basal.kerato.markers <- c("Krt14","Krt5","Cdh3","Krt15","Col17a1","Tp73","Fam83g","Fgfr3","Tp63","Bcl11b")
spinous.kerato.markers <- c("Cdh1","Krt1","Krt10","Dsg1","Lgal57","Hopx","Cyp4f22","Grhl1","Prgs1","Klk9")
granular.kerato.markers <- c("Dsc1","Krt2","Ivl","Tgm3","Kprp","Pof1b","Dnase1l2","Trex2","Otx1","Eps8l1","Card18")
FeaturePlot(kerato,feature = "Krt14")
Error: Unable to find a DimReduc matching one of 'umap', 'tsne', or 'pca', please specify a dimensional reduction to use